{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import Spider\n",
    "\n",
    "def highlight(text: str, keyword: str):\n",
    "    \"\"\"Wrap every case-insensitive occurrence of ``keyword`` in ``text`` with asterisks.\n",
    "\n",
    "    Each match keeps the casing it has in ``text``; only the ``*`` markers are added.\n",
    "\n",
    "    :param text: string to annotate.\n",
    "    :param keyword: term to mark; matched literally (not as a regex) and case-insensitively.\n",
    "    :return: ``text`` with each occurrence surrounded by ``*``, or ``text`` unchanged\n",
    "        when ``keyword`` is empty or does not occur.\n",
    "    \"\"\"\n",
    "    if not keyword:\n",
    "        # An empty pattern matches at every position and would fill the text with '**'.\n",
    "        return text\n",
    "    # Escape the keyword so characters such as '+' or '(' are matched literally.\n",
    "    pattern = re.compile(re.escape(keyword), re.IGNORECASE)\n",
    "    return pattern.sub(lambda match: f'*{match.group(0)}*', text)\n",
    "\n",
    "def score(item, keyword: str, title_weight: int = 5, content_weight: int = 3):\n",
    "    \"\"\"Rank a document by how often ``keyword`` occurs in its title and content.\n",
    "\n",
    "    :param item: indexable record; ``item[1]`` is read as the title and ``item[2]``\n",
    "        as the content (both must be strings).\n",
    "    :param keyword: search term, compared case-insensitively.\n",
    "    :param title_weight: points awarded per occurrence in the title.\n",
    "    :param content_weight: points awarded per occurrence in the content.\n",
    "    :return: weighted occurrence count; 0 when ``keyword`` is empty.\n",
    "    \"\"\"\n",
    "    needle = keyword.lower()\n",
    "    if not needle:\n",
    "        # str.count('') returns len(s) + 1, which would score documents by their length.\n",
    "        return 0\n",
    "    title_hits = item[1].lower().count(needle)\n",
    "    content_hits = item[2].lower().count(needle)\n",
    "    return title_hits * title_weight + content_hits * content_weight\n",
    "\n",
    "class MySearcher:\n",
    "    \"\"\"Linear-scan keyword search over the documents loaded from a pickle file.\n",
    "\n",
    "    Each document is an indexable record; position 1 is used as the title and\n",
    "    position 2 as the content.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, scale: int=1):\n",
    "        \"\"\"Baseline version: load the documents and optionally replicate them.\n",
    "\n",
    "        :param scale: when greater than 1, the document list is repeated this\n",
    "            many times so search speed can be measured on a larger corpus.\n",
    "        \"\"\"\n",
    "        self.docs = list()\n",
    "        self.load_data()\n",
    "        if scale > 1:\n",
    "            self.docs *= scale  # enlarge the corpus to measure search speed\n",
    "\n",
    "    def load_data(self, data_file_name='./news_list.pkl'):\n",
    "        \"\"\"Populate ``self.docs`` from ``data_file_name``.\n",
    "\n",
    "        When the file does not exist, ``Spider.pickle_save`` is called first to\n",
    "        create it (presumably by crawling -- confirm in the Spider module).\n",
    "        \"\"\"\n",
    "        if not os.path.exists(data_file_name):\n",
    "            Spider.pickle_save(data_file_name)\n",
    "        # NOTE(review): presumably unpickles the file -- only load .pkl files this project produced.\n",
    "        self.docs = Spider.pickle_load(data_file_name)\n",
    "\n",
    "    def search(self, keyword):\n",
    "        \"\"\"Return the documents whose title or content contains ``keyword``.\n",
    "\n",
    "        Matching is case-insensitive.\n",
    "\n",
    "        :param keyword: search term.\n",
    "        :return: list of ``[doc_index, score, title]`` rows sorted by score, highest\n",
    "            first; ``doc_index`` is the position of the document in ``self.docs``.\n",
    "        \"\"\"\n",
    "        keyword = keyword.lower()\n",
    "        hits = list()\n",
    "        for doc_index, item in enumerate(self.docs):\n",
    "            # Check title and content independently: wrapping `title or ...` in a single\n",
    "            # .lower() call would only ever examine the title.\n",
    "            if keyword in item[1].lower() or keyword in item[2].lower():\n",
    "                hits.append([doc_index, score(item, keyword), item[1]])\n",
    "        hits.sort(key=lambda hit: hit[1], reverse=True)\n",
    "        return hits\n",
    "\n",
    "    def render_search_result(self, keyword):\n",
    "        \"\"\"Print one line per hit: 1-based rank, score, and the title with ``keyword`` highlighted.\"\"\"\n",
    "        for rank, hit in enumerate(self.search(keyword), start=1):\n",
    "            doc_index, doc_score = hit[0], hit[1]\n",
    "            title = self.docs[doc_index][1]\n",
    "            print(f'{rank}[{doc_score}] {highlight(title, keyword)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1[185] 传*华为*预计今年智能手机出货量同比减少60%，降至7000万部\n",
      "2[113] *华为*将在英国起诉汇丰，要拿到孟晚舟案关键文件\n",
      "3[101] *华为*推\"智慧养猪\"，任正非曾称如果养猪可能也是状元\n",
      "4[91] *华为*推“智慧养猪”，任正非：*华为*不靠手机也能活\n",
      "5[74] *华为*Mate X2镜头供应商为舜宇光学和欧菲光 屏幕为三星\n",
      "6[73] *华为*折叠屏手机*华为*MateX2将于下周一20时发布\n",
      "7[59] 除了欢迎拜登致电*华为*，任正非还谈了孟晚舟、退休时间、5G转让等\n",
      "8[58] *华为*供应链公司：已向*华为*P50手机供货，供货时间有延后\n",
      "9[56] 争国内第一！荣耀想打败*华为*小米，靠3599的V40行么？\n",
      "10[35] 急着剔除*华为*、中兴设备，美国政府又要掏19亿美元\n",
      "11[32] 外媒：夺回*华为*失去的市场，荣耀仍能重现辉煌\n"
     ]
    }
   ],
   "source": [
    "# Build a searcher over the unscaled document list and print the ranked hits for one keyword.\n",
    "searcher = MySearcher()\n",
    "# searcher.search('华为')  # raw [doc_index, score, title] rows, without formatting\n",
    "searcher.render_search_result('华为')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 378 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Time the construction of four searchers. Each constructor calls load_data() and,\n",
    "# for scale > 1, repeats its document list `scale` times.\n",
    "searcher_1x = MySearcher(scale=1)\n",
    "searcher_10x = MySearcher(scale=10)\n",
    "searcher_100x = MySearcher(scale=100)\n",
    "searcher_1000x = MySearcher(scale=1000)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1.98 ms\n",
      "Wall time: 21.3 ms\n",
      "Wall time: 202 ms\n"
     ]
    }
   ],
   "source": [
    "# Time one search call as the document list grows by 10x, 100x and 1000x.\n",
    "%time r = searcher_10x.search('华为')\n",
    "%time r = searcher_100x.search('华为')\n",
    "%time r = searcher_1000x.search('华为')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 19.2 ms\n",
      "Wall time: 203 ms\n"
     ]
    }
   ],
   "source": [
    "# Time 10 and then 100 consecutive searches, both on the 10x searcher.\n",
    "%time for i in range(10):r = searcher_10x.search('华为')\n",
    "%time for i in range(100):r = searcher_10x.search('华为')\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}